## ********************************
##      B1 Full Sample Regression       
##        Hao Zhang & Ye Zhang
##             2025/7/24
## ********************************

# load packages
library(readstata13)
library(lfe)
library(stargazer)
library(tidyverse)
library(doBy)
library(data.table)
library(DescTools)
library(haven)

# set working directory
setwd("../raw data")

# read in data
options(scipen = 100)
firm <- read.dta13("cies 1998-2007.dta")
firm <- firm %>% mutate(city = as.numeric(substr(as.character(city), 0, 6)))

# ownership classification
firm$ownership <- as.numeric(firm$ownership)
firm <- firm %>% mutate(soe = ifelse(ownership == 110 | ownership == 151, 1, 0),
                        hmt = ifelse(ownership > 199 & ownership < 299, 1, 0),
                        fie = ifelse(ownership > 299, 1, 0), 
                        private = ifelse(soe == 0 & hmt == 0 & fie == 0, 1, 0))

# create insurance rate (payment/previous wage)
firm_base <- firm %>% select(city, year, frdm, wage) %>%
  filter(wage > 0) %>% mutate(year = year + 1)
firm <- firm %>% filter(insurance >= 0)
firm <- left_join(firm, firm_base, by = c("city", "year", "frdm"), relationship = "many-to-many") # a very small number of duplicates due to within-city branches
firm <- firm %>% mutate(rate1 = insurance/wage.y, rate2 = medical/wage.y) %>% 
  mutate(wage = wage.x)
firm$rate1 <- Winsorize(firm$rate1, quantile(firm$rate1, probs = c(0.05, 0.95), na.rm = TRUE))
firm$rate2 <- Winsorize(firm$rate2, quantile(firm$rate2, probs = c(0.05, 0.95), na.rm = TRUE))

# create insurance dummy
firm <- firm %>% mutate(insurance_dummy = ifelse(insurance == 0, 0, 1), 
                        medical_dummy = ifelse(medical == 0, 0, 1))

# create industry indicator and filter for years and active firms
firm <- firm %>% mutate(sector = as.integer(industry/100)) %>% 
  filter(employment > 6, year >= 2001)

# clean city panel and city leadership (focus on non-provincial capitals)
city <- read.dta13("city panel 1995-2010.dta")
city <- city %>% filter(substr(as.character(city$citycode), 3, 4) != "01")
city1 <- city %>% select(citycode, year, gdp) %>% mutate(year = year + 1)
city <- left_join(city, city1, by = c("citycode", "year"))
city$gdpgrowth <- city$gdp.x/city$gdp.y - 1
city <- city %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(province != 11, province != 12, province != 31, citycode != 5102) %>%
  group_by(province, year) %>%
  mutate(gdpgrowth_mean = mean(gdpgrowth, na.rm = TRUE)) %>% 
  mutate(year = year + 1) %>% #delete the threshold for full sample
  mutate(gdp_pressure = gdpgrowth_mean - gdpgrowth) %>%
  rename(gdp = gdp.x)
mayor <- read.dta13("city mayor 1990-2015.dta") %>% 
  select(citycode, year, code, begin, end, educ, ethnicity, gender, birth_year, seq)

# add tax collection, unemployment rate, and medical resources
city1 <- readxl::read_xlsx("city panel 1996-2014.xlsx") # warnings due to conversion from text to number for the year variable for districts and provinces
citycode <- readxl::read_xlsx("citycode.xlsx") %>%
  mutate(citycode = as.integer(citycode/100)) %>% distinct()
city1 <- left_join(city1, citycode, by = "city", relationship = "many-to-many") %>% distinct() %>%
  filter(year >= 1999 & year <= 2007) # a small number of duplicates due to changing codes
city <- left_join(city, city1, by = c("citycode", "year"))

# add variables
firm <- firm %>% mutate(citycode = as.integer(city/100))
firm <- left_join(firm, city, by = c("citycode", "year"))
firm <- left_join(firm, mayor, by = c("citycode", "year"))
firm$count <- 1
number <- summaryBy(count ~ citycode + year, data = firm, FUN = sum, na.rm=TRUE)
firm <- left_join(firm, number, by = c("citycode", "year"))

# retain main dataset for firm-level analysis
rm(list=setdiff(ls(), "firm"))

# create variables
firm <- firm %>% mutate(capacity = (tax_revenue*100000 - profits)/gdp/10, 
                        deficit = expenditure - revenue_city, 
                        unemp_rate = unemployment/population) 

# label collection authority changes
firm <- firm %>% mutate(province = as.integer(citycode/100)) %>% 
  filter(year >= 2001) %>%
  mutate(tax = ifelse(province == 33 & year >= 2004 | province == 13 & year >= 2002 | 
                        province == 42 & year >= 2002 | province == 44 & year >= 2002 | 
                        province == 32 & year >= 2005 | province == 53 & year >= 2006, 1, 0))

# run inverse hyperbolic sine
ihs <- function(x) {
  y <- log(x + sqrt(x ^ 2 + 1))
  return(y)
}

# unemployment insurance
m1 <- felm(insurance_dummy ~ gdp_pressure + gdpgrowth + log(gdp/population) + 
             ihs(deficit/gdp) + ihs(capacity) + tax + log(FDI + 1) + 
             ihs(wage) + ihs(employment) + ihs(output1) + ihs(profit) | 
             frdm + year + educ + ethnicity  + gender + 
             begin + connection + seq | 0 | citycode, data = firm)

firm1 <- firm %>% filter(rate1 > 0)
m2 <- felm(log(rate1) ~ gdp_pressure + gdpgrowth + log(gdp/population) + 
             ihs(deficit/gdp) + ihs(capacity) + tax + log(FDI + 1) + 
             ihs(wage) + ihs(employment) + ihs(output1) + ihs(profit) | 
             frdm + year + educ + ethnicity  + gender + 
             begin + connection + seq | 0 | citycode, data = firm1)

# medical insurance
m3 <- felm(medical_dummy ~ gdp_pressure + gdpgrowth + log(gdp/population) + 
             ihs(deficit/gdp) + ihs(capacity) + tax + log(FDI + 1) + 
             ihs(wage) + ihs(employment) + ihs(output1) + ihs(profit) | 
             frdm + year + educ + ethnicity  + gender + 
             begin + connection + seq | 0 | citycode, data = firm)

firm2 <- firm %>% filter(rate2 > 0)
m4 <- felm(log(rate2) ~ gdp_pressure + gdpgrowth + log(gdp/population) + 
             ihs(deficit/gdp) + ihs(capacity) + tax + log(FDI + 1) + 
             ihs(wage) + ihs(employment) + ihs(output1) + ihs(profit) | 
             frdm + year + educ + ethnicity  + gender + 
             begin + connection + seq | 0 | citycode, data = firm2)

# output regression table (omit four firm-level variables)
stargazer(m1, m2, m3, m4, title = "Interjurisdictional Competition and Social Insurances (Firm Level)", 
          dep.var.labels = c("Unemployment", "Unemployment", "Health and Pension", "Health and Pension"),
          omit.stat = c("rsq", "adj.rsq", "ser"), 
          font.size = "small", 
          add.lines = list(c("Mayor Characteristics", "Y", "Y", "Y", "Y"),
                           c("Firm Characteristics", "Y", "Y", "Y", "Y"), 
                           c("Firm Fixed Effects", "Y", "Y", "Y", "Y"), 
                           c("Year Fixed Effects", "Y", "Y", "Y", "Y")))
